## Data Cleaning

# Load the raw (numeric-only, reduced) compound database.
data <- read.csv("../data/filledDatabase10042019NUMonlyREDUCED.csv")

# Apply the project's cleaning routine (defined elsewhere in this project).
data <- clean_data(data)

# Split the identifier columns away from the predictors: compound names
# and group categories are labels, not features.
compound <- data$Compound
group_cat <- data$GroupCat
group_cat_text <- paste("Grp", group_cat)
data <- select(data, -c("Compound", "GroupCat"))

# Keep a copy of the predictor matrix for the biplots further below.
data_biplot <- data

PCA

PCA circle

# PCA correlation circle: predictor loadings in the PC1-PC2 plane,
# colored by each variable's contribution to those components.
fviz_pca_var(
  # `scale.` spelled out — the original `scale = TRUE` only worked through
  # partial argument matching against prcomp's `scale.` parameter.
  prcomp(data, scale. = TRUE),
  col.var = "contrib", # Color by contributions to the PC
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE     # Avoid text overlapping
)
Vectors of predictors in the space of PC1 and PC2

Figure 1: Vectors of predictors in the space of PC1 and PC2

PCA importance

Here we choose the first 17 PCs for further analysis.

# Print the standard-deviation / proportion-of-variance / cumulative-proportion
# table for the first k principal components (project helper).
print_pca_importance(data, k = 20)
##                             PC1      PC2      PC3     PC4      PC5
## Standard deviation     3.743427 3.089436 2.520277 2.41347 2.047305
## Proportion of Variance 0.237510 0.161770 0.107660 0.09873 0.071040
## Cumulative Proportion  0.237510 0.399290 0.506940 0.60567 0.676710
##                             PC6      PC7     PC8      PC9     PC10
## Standard deviation     1.904534 1.609677 1.36274 1.332563 1.120081
## Proportion of Variance 0.061480 0.043920 0.03148 0.030100 0.021260
## Cumulative Proportion  0.738190 0.782110 0.81358 0.843680 0.864940
##                            PC11     PC12      PC13      PC14      PC15
## Standard deviation     1.037753 1.019394 0.8502572 0.8243994 0.7964395
## Proportion of Variance 0.018250 0.017610 0.0122500 0.0115200 0.0107500
## Cumulative Proportion  0.883200 0.900810 0.9130600 0.9245800 0.9353300
##                             PC16      PC17     PC18      PC19      PC20
## Standard deviation     0.7065705 0.6779897 0.639666 0.5899257 0.5650318
## Proportion of Variance 0.0084600 0.0077900 0.006940 0.0059000 0.0054100
## Cumulative Proportion  0.9437900 0.9515900 0.958520 0.9644200 0.9698300
# Project the predictors onto the first 17 PCs, then standardize the scores.
data_pca <- scale(get_pc_space(data, k = 17))

Compounds in the space of PC's

# Seven base colors, recycled 10 times so any group index maps to a color.
set_color <- rep(
  c("#0071C3", "#DE501A", "#EEB020", "#7E2E8E", "#79AC2C", "#4DBDF7", "#A51331"),
  10
)

# Compounds in the PC1-PC2 plane, colored and text-labeled by group category.
# color = GroupCat is mapped once in ggplot() and inherited by both geoms.
data.frame(Compound = compound, GroupCat = group_cat_text, data_pca) %>%
  ggplot(aes(x = PC1, y = PC2, color = GroupCat)) +
  geom_point(size = 2, alpha = 0.4) +
  geom_text(aes(label = Compound), size = 3) +
  scale_color_manual(values = set_color) +
  scale_fill_manual(values = set_color) +
  scale_shape_manual(values = 1:11) +
  theme_minimal()
Compounds in the space of the first two PC's

Figure 2: Compounds in the space of the first two PC’s

PCA biplot

library(RColorBrewer)

# Label rows with unique, syntactically valid compound names so the biplot
# can identify individual points.
rownames(data_biplot) <- make.names(compound, unique = TRUE)

# PCA on the correlation matrix (equivalent to scaling the predictors).
fit <- princomp(data_biplot, cor = TRUE)

# Biplot: compounds as filled points, predictor vectors colored by
# contribution. Fixed `aesx` -> `axes`: the misspelled argument was
# silently swallowed by `...` (it happened to match the default c(1, 2)).
fviz_pca_biplot(fit, axes = c(1, 2),
                # individuals (compounds)
                label = "var", labelsize = 4,
                geom = c("point","text"), fill.ind = group_cat_text, alpha.ind = 0.7,
                pointsize = 2, pointshape = 21, palette = set_color[1:11],
                # variables (predictors)
                col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel=TRUE) +
  labs(fill = "Group Cat", color = "Contrib")
PCA biplot (compounds and predictor vectors in the space of the first two PC's)

Figure 3: PCA biplot (compounds and predictor vectors in the space of the first two PC’s)

Cubic vs Tilted

2D

# Restrict to the cubic ("Grp 3") and tilted ("Grp 5") groups and label
# their compounds in the PC1-PC2 plane.
data.frame(Compound = compound, GroupCat = group_cat_text, data_pca) %>%
  filter(GroupCat %in% c("Grp 3", "Grp 5")) %>%
  ggplot(aes(x = PC1, y = PC2, color = GroupCat)) +
  # geom_point(aes(color = GroupCat), size = 4, alpha = 0.4) +
  geom_text(aes(label = Compound), size = 3, alpha = 0.7) +
  scale_color_manual(values = set_color) +
  scale_fill_manual(values = set_color) +
  scale_shape_manual(values = 1:11) +
  theme_minimal()
Cubic and Tilted in the space of the first two PC's

Figure 4: Cubic and Tilted in the space of the first two PC’s

3D

# Interactive 3D view of the cubic ("Grp 3") and tilted ("Grp 5") compounds
# in the space of the first three PCs.
data.frame(Compound = compound, GroupCat = group_cat_text, data_pca) %>%
  filter(GroupCat %in% c("Grp 3","Grp 5")) %>%
  droplevels() %>%
  plot_ly(x = ~PC1, y = ~PC2, z = ~PC3, color = ~GroupCat, text = ~Compound, colors = set_color[1:2]) %>%
  add_markers(marker=list(size = 7, opacity = 0.6)) %>%
  layout(scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')),
         # fixed typo in the displayed title: "Titled" -> "Tilted"
         title = "Cubic and Tilted in the space of PC1, PC2, PC3")

Figure 5: Cubic and Tilted in the space of the first three PC's

Biplot

# Re-fit PCA on the correlation matrix for the highlighted biplot.
fit <- princomp(data_biplot, cor = TRUE)

# Build a fill palette that highlights only the cubic ("Grp 3") and tilted
# ("Grp 5") groups; every other group gets a white (invisible) fill.
# Vectorized ifelse() replaces the original loop that grew `new_color`
# one element at a time over a hard-coded 1:11, and it adapts to the
# actual number of group levels. The +2 offset reproduces the original
# color choice (set_color[i + 2]).
group_levels <- levels(as.factor(group_cat_text))
new_color <- ifelse(group_levels %in% c("Grp 3", "Grp 5"),
                    set_color[seq_along(group_levels) + 2],
                    "white")

# Highlighted biplot: only Grp 3 / Grp 5 points receive a visible fill.
# Fixed `aesx` -> `axes` (the misspelled argument was silently swallowed
# by `...`; it happened to coincide with the default c(1, 2)).
fviz_pca_biplot(fit, axes = c(1, 2),
                # individuals (compounds)
                label = "var", labelsize = 4,
                geom = c("point","text"), fill.ind = group_cat_text, col.ind = "white", alpha.ind = 0.7,
                pointsize = 2, pointshape = 21, palette = new_color,
                # variables (predictors)
                col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel=TRUE) +
  labs(fill = "Group Cat", color = "Contrib", title = "PCA biplot - Cubic and Tilted")

# data_3 = data[group_cat_text %in% c("Grp 3"),] %>% mutate(GroupCat = "Grp 3")
# data_5 = data[group_cat_text %in% c("Grp 5"),] %>% mutate(GroupCat = "Grp 5")
# 
# data_3_5 = rbind(data_3, data_5)
# ggplot(melt(data_3_5[,c(49:59,60)], id.vars="GroupCat"), aes(value)) + 
#   geom_histogram(bins = 20, aes(fill=GroupCat), position = "dodge", color = "grey", size = 0.1) + 
#   facet_wrap(~variable, scales = 'free_x') +
#   scale_fill_manual(values = set_color) +
#   theme_bw()

K-means

k = 2

# k-means with 2 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 2)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 3

# k-means with 3 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 3)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 4

# k-means with 4 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 4)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 5

# k-means with 5 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 5)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 6

# k-means with 6 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 6)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 7

# k-means with 7 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 7)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 8

# k-means with 8 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 8)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 9

# k-means with 9 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 9)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 10

# k-means with 10 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 10)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 11

# k-means with 11 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 11)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 12

# k-means with 12 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 12)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 13

# k-means with 13 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 13)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 14

# k-means with 14 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 14)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 15

# k-means with 15 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 15)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 16

# k-means with 16 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 16)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 20

# k-means with 20 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 20)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 24

# k-means with 24 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 24)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

k = 25

# k-means with 25 clusters on the standardized PC scores (project helpers)
kmean_results <- get_kmeans_results(data_pca, k = 25)
get_kmeans_visual_2D(kmean_results)

get_kmeans_histogram(kmean_results)

### explore k visually

# choose the number of clusters to explore
# k = 4
# 
# # build a model for kmeans
# kmean_results = get_kmeans_results(data_pca, k = k)
# 
# # generate 2D visual for this k 
# get_kmeans_visual_2D(kmean_results)
# 
# # histogram matlab 
# get_kmeans_histogram(kmean_results)

# generate 3D visual for this k
# get_kmeans_visual_3D(kmean_results)
### save results 

# choose k to save
# k = c(2:25)

# save the result as csv
# save_kmeans_results(data, data_pca, k)
# Wider, shorter default figures for the chunks that follow.
knitr::opts_chunk$set(fig.width = 10, fig.height = 5)

Mclust

Based on a criterion BIC, we choose a Gaussian mixture model with 3 components.

### mclust visuals

# Fit a Gaussian mixture model to the PC scores (project helper wrapping
# mclust; the component count is selected internally, e.g. by BIC).
model <- get_mclust_model(data_pca)

# Report model type, component count, BIC/ICL, and the clustering table.
summary(model)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model
## with 3 components: 
## 
##  log-likelihood   n  df       BIC      ICL
##       -4453.406 311 512 -11845.59 -11845.6
## 
## Clustering table:
##   1   2   3 
## 134 142  35
# 2D visual of the mixture-model clustering (project helper)
get_mclust_visual_2D(model)

# cluster histogram (original note called this the "matlab" histogram)
get_mclust_histogram(model)

# 3D visual for mclust — disabled; uncomment for an interactive view
# get_mclust_visual_3D(model)
### save results 

# save_mclust_results(data, data_pca, compound, group_cat)

Neural Nets

  • Classify all data points into the 6 largest groups
  • 4-hidden-layer model with leave-one-out achieves classification rate 54%

x

# # check any missing values in chem group
# group_cat %>% check_missing()
# 
# # dummy first k chem group
# group_cat_dummy = dummy_group_cat(group_cat, k = 5)
# 
# # normalize data_pca
# data_pca_scaled = scale(data_pca) %>% as.data.frame()
# 
# # compute cv classification rate
# get_nn_cv_rate(group_cat_dummy, data_pca_scaled) %>% mean()

# build a model for neural nets
# model = get_nn_model(group_cat_dummy, data_pca_scaled)

# plot neural nets
# library(devtools)
# source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
# plot.nnet(model)

# save nn results 
# save_nn_results(model, compound, group_cat, data, names(group_cat_dummy))

# save model weights (layers = 20,10,5)
# data.frame(weight = model$result.matrix) %>% 
#   rownames_to_column() %>% 
#   filter(between(row_number(), 4, n())) %>% 
#   separate(rowname, c("node_in","node_out"), sep = ".to.") %>%
#   write.csv("../result/nn weights.csv", row.names = FALSE)
### explore for k 

# k = c(3:11)
# 
# dummy = list()
# for(i in 1:length(k)){
#   dummy[[i]] = repeat_B(k = k[i])
# }
# 
# data.frame(dummy) %>% 
#   set_colnames(paste("k=", k, sep="")) %>%
#   reshape2::melt() %>%
#   ggplot(aes(x = value)) +
#   geom_histogram(bins = 30) +
#   labs(x="Classification Rate", y="Frequency") +
#   theme_bw() + coord_flip() +
#   facet_grid(~variable)


# data.frame(dummy) %>% set_colnames(paste("k=", k, sep="")) %>%
#   write.csv("../result/nn class rate.csv", row.names = FALSE)